package com.example.spider.provider.why10w.thn21;

import com.example.spider.provider.why10w.AbstractWhy10wListResultHandler;
import com.example.spider.provider.why10w.dto.Why10wDetailTask;
import com.example.spider.provider.why10w.dto.Why10wListTask;
import com.example.spider.task.SpiderTaskQueue;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

import java.nio.charset.Charset;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Handles list-page results for the "why" article collection hosted on www.thn21.com.
 *
 * <p>Seeds one list task per paginated index page, extracts the article links found on each
 * page and enqueues a detail-download task for every link. All tasks created by this handler
 * are configured with the GBK charset.
 *
 * @author lym
 */
@Slf4j
//@Component
public class Thn21ListResultHandler extends AbstractWhy10wListResultHandler {

    /** Charset applied to every list and detail task created by this handler. */
    private static final Charset GBK = Charset.forName("GBK");

    /**
     * First-level category of each list page, indexed by page number.
     * Index 0 is a placeholder so that page numbers (which start at 1) can be used directly.
     */
    private static final List<String> CATEGORY_BY_PAGE = List.of(
            "-", "营养健康", "动物世界", "军事交通", "人体的奥秘", "天文地理", "信息科技", "你身边的科技");

    /** Site root that is prepended to the hrefs found on the list page. */
    private static final String SITE_ROOT = "https://www.thn21.com";

    /** Path fragment removed from an href to obtain the detail task id. */
    private static final String DETAIL_PATH_PREFIX = "/Article/wai/zhishi/shiwan/";

    /**
     * Enqueues one list task per known page (pages 1 through 7).
     *
     * <p>The page count is fixed, so no automatic pagination is performed. According to the
     * original author's note, the last page has a special layout with several questions and
     * answers on a single page.
     */
    @Override
    public void startSpiders() {
        for (int pageNo = 1; pageNo < CATEGORY_BY_PAGE.size(); pageNo++) {
            SpiderTaskQueue.putTask(createPageTask(CATEGORY_BY_PAGE.get(pageNo), null, pageNo));
        }
    }

    /**
     * Builds the URL of a list page.
     *
     * @param firstCategory  not used for URL construction
     * @param secondCategory not used for URL construction
     * @param pageNo         1-based page number; page 1 has no numeric suffix
     * @return the absolute URL of the requested list page
     */
    @Override
    protected String buildPageTaskUrl(String firstCategory, String secondCategory, int pageNo) {
        return pageNo == 1 ? "https://www.thn21.com/Article/wai/5551.html" : "https://www.thn21.com/Article/wai/5551_" + pageNo + ".html";
    }

    /**
     * Parses a list page, enqueues a detail task for each article link, then stores the HTML.
     *
     * <p>Links are taken from the first {@code <a>} of every {@code <p>} inside the element
     * with id {@code v}; anchors with a blank {@code href} are ignored.
     *
     * @param task the list task whose response is being handled
     * @param html the list page markup
     * @throws IllegalStateException if the page contains no element with id {@code v}
     */
    @Override
    public void doHandle(Why10wListTask task, String html) {
        Element container = Jsoup.parse(html).getElementById("v");
        if (container == null) {
            // getElementById returns null when nothing matches; fail with context instead of
            // an anonymous NullPointerException on the chained call.
            throw new IllegalStateException(
                    "List page has no element with id 'v'; cannot extract detail links for task " + task);
        }

        List<Element> links = container.getElementsByTag("p")
                .stream()
                .map(paragraph -> paragraph.getElementsByTag("a"))
                .filter(CollectionUtils::isNotEmpty)
                .map(anchors -> anchors.get(0))
                .filter(anchor -> StringUtils.isNotBlank(anchor.attr("href")))
                .collect(Collectors.toList());

        // Enqueue sequentially: keeps the queue order identical to the page order and avoids
        // running queue operations on the shared common fork-join pool.
        for (Element link : links) {
            String title = link.text()
                    .replace("◆", "")
                    .replace("&quot;", "\"")
                    .trim();
            String href = link.attr("href");

            // Example target: https://www.thn21.com/Article/wai/zhishi/shiwan/kexue_1.html#9
            Why10wDetailTask detailTask = createDetailTask(task,
                    href.replace(DETAIL_PATH_PREFIX, ""), title,
                    SITE_ROOT + href);
            // Schedule the download of the detail page.
            SpiderTaskQueue.putTask(detailTask);
        }

        repository.saveHtml(task, html);
    }

    /**
     * Creates a detail task through the superclass and sets its charset to GBK.
     *
     * @param task  the list task the detail link was found on
     * @param id    identifier of the detail entry
     * @param title link text of the detail entry
     * @param url   absolute URL of the detail page
     * @return the detail task configured with the GBK charset
     */
    protected Why10wDetailTask createDetailTask(Why10wListTask task, String id, String title, String url) {
        return (Why10wDetailTask) super.createDetailTask(task, id, title, url)
                .setCharset(GBK);
    }

    /**
     * Creates a list-page task through the superclass and sets its charset to GBK.
     *
     * @param firstCategory  first-level category of the page
     * @param secondCategory second-level category of the page; may be {@code null}
     * @param pageNo         1-based page number
     * @return the list task configured with the GBK charset
     */
    protected Why10wListTask createPageTask(String firstCategory, String secondCategory, int pageNo) {
        return (Why10wListTask) super.createPageTask(firstCategory, secondCategory, pageNo)
                .setCharset(GBK);
    }

}
